Lesson 4


Scatterplots and Perceived Audience Size

Scatterplots

library('ggplot2')
# Load the tab-separated pseudo-Facebook data set into `pf`.
pf <- read.delim("./pseudo_facebook.tsv")

# Quick scatterplot of friend count against age
qplot(age, friend_count, data = pf)


What are some things that you notice right away?

Response: Younger people have more friends than the others


ggplot Syntax

# The same age vs. friend_count scatterplot, built layer by layer with
# ggplot(); the x axis is limited to ages 13-90 with a tick every 5 years.
ggplot(pf, aes(age, friend_count)) +
  geom_point(na.rm = TRUE) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, by = 5))


Overplotting

Notes: The bottom part of the plot has too many overlapping points, which makes it harder to read.

# Adding transparency
# With alpha = 1/20, twenty overlapping points add up to one fully black point.
# geom_jitter() is used here; to compare with unjittered points, swap the
# layer for: geom_point(alpha = 1/20, na.rm = TRUE)
ggplot(pf, aes(age, friend_count)) +
  geom_jitter(alpha = 1 / 20, na.rm = TRUE) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, by = 5))

What do you notice in the plot?

Response: Most people have around 1000-2000 friends ***

Coord_trans()

Notes:

# Transparent points with the y axis drawn on a square-root scale through the
# coordinate system. The axis is relabelled because the mapped expression is
# friend_count + 1.
ggplot(pf, aes(age, friend_count + 1)) +
  geom_point(alpha = 1 / 20, na.rm = TRUE) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, by = 5)) +
  ylab("Friend Count") +
  coord_trans(y = "sqrt")


Alpha and Jitter

Notes:

# Jittered, semi-transparent scatterplot of friendships initiated against age.
# The y aesthetic names the column directly: aes() is evaluated inside `data`,
# and ggplot2 discourages `pf$...` inside aes() because it bypasses the data
# argument (it breaks with facets or a different `data`).
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_jitter(alpha = 1/10, na.rm = TRUE) +
  scale_x_continuous(breaks = seq(13,90,10), limits = c(13,90)) +
  scale_y_continuous(limits = c(0,2000), breaks = seq(0,2000,500)) +
  ylab('Friendships Initiated')

# Alternate Syntax
# Same plot, with the jitter requested through geom_point(position = 'jitter').
# The y aesthetic uses the bare column name rather than `pf$...`, which
# ggplot2 discourages inside aes() because it bypasses the data argument.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/10, position = 'jitter', na.rm = TRUE) +
  scale_x_continuous(breaks = seq(13,90,10), limits = c(13,90)) +
  scale_y_continuous(limits = c(0,2000), breaks = seq(0,2000,500)) +
  ylab('Friendships Initiated')


Overplotting and Domain Knowledge

Notes: Used Percentage for Axes to study the data ***

Conditional Means

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# age_by_group <- group_by(pf, age)
# pf.fc_by_age <- summarise(
#   age_by_group,
#   friend_count_mean = mean(friend_count),
#   friend_count_median = median(friend_count),
#   n = n()
# )
# 
# pf.fc_by_age <-  arrange(pf.fc_by_age)

# Using pipes in R
# Per-age summary of friend_count: mean, median and number of rows (n).
# arrange() called with no columns leaves the row order untouched, so the
# sort key is given explicitly to order the summary by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13              165.                 74    484
## 2    14              251.                132   1925
## 3    15              348.                161   2618
## 4    16              352.                172.  3086
## 5    17              350.                156   3283
## 6    18              331.                162   5196
# Mean friend count per age, drawn as points
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_point(na.rm = TRUE) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, by = 10)) +
  scale_y_continuous(limits = c(0, 450), breaks = seq(0, 450, by = 50))

# Same per-age means, connected with geom_line() to show the trend

ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line(na.rm = TRUE) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, by = 10)) +
  scale_y_continuous(limits = c(0, 450), breaks = seq(0, 450, by = 50))


Overlaying Summaries with Raw Data

# Raw data (orange points) overlaid with per-age summary lines: the mean, the
# median, and the 10th and 90th percentiles (red, dashed).
# The summary function is passed as `fun`; the older `fun.y` argument of
# stat_summary() is deprecated in current ggplot2 releases.
ggplot(data = pf, aes(x = age, y = friend_count + 1)) +
  geom_point(alpha = 1/20, color = 'orange') +
  scale_x_continuous(breaks = seq(13,90,5), limits = c(13,90)) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun = mean) +
  geom_line(stat = 'summary', fun = median) +
  geom_line(stat = 'summary', fun = quantile,
            fun.args = list(probs = .9), color = 'red', linetype = 2) +
  geom_line(stat = 'summary', fun = quantile,
            fun.args = list(probs = .1), color = 'red', linetype = 2) +
  ylab('Friend Count')

Correlation

Notes:

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Pearson correlation test between age and friend count.
# Same test without with(): cor.test(pf$age, pf$friend_count)
with(pf, cor.test(x = age, y = friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Correlation Methods

Notes: Use Pearson by default


Create Scatterplots

# Likes received via the web (www_likes_received) against all likes received
ggplot(pf, aes(x = likes_received, y = www_likes_received)) +
  geom_point()


Strong Correlations

Notes:

# Limit both axes to the 95th percentile of their variable, fade the points,
# and add a linear-model fit line in red.
ggplot(pf, aes(x = likes_received, y = www_likes_received)) +
  xlim(0, quantile(pf$likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  geom_point(alpha = 1 / 20, na.rm = TRUE) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Pearson correlation test on the full data (top 5% of values included)
cor.test(x = pf$likes_received, y = pf$www_likes_received, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$likes_received and pf$www_likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

#install.packages('alr3')
library(alr3)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set and print a per-column summary.
data(Mitchell)
summary(Mitchell)
##      Month             Temp        
##  Min.   :  0.00   Min.   :-7.4778  
##  1st Qu.: 50.75   1st Qu.:-0.3486  
##  Median :101.50   Median :10.4500  
##  Mean   :101.50   Mean   :10.3125  
##  3rd Qu.:152.25   3rd Qu.:20.4306  
##  Max.   :203.00   Max.   :27.6056

Create your plot!

# Temperature against month index for the Mitchell data
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot. 0.01

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Pearson correlation test between month index and temperature
cor.test(x = Mitchell$Month, y = Mitchell$Temp, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Same scatterplot in red, with an x-axis tick every 12 months
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point(color = "red") +
  scale_x_continuous(limits = c(0, 205), breaks = seq(0, 203, by = 12))


A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:


Age with Months Means

Programming Assignment


Noise in Conditional Means


Smoothing Conditional Means

Notes:


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection:


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!